import scrapy
import json
from nlproject.items import SingleItem

"""
Usage (the spider is registered under the name "url"):
scrapy crawl url -a start_urls='["http://news.baidu.com/"]' -a xpaths='{"first": "//a", "title": "text()", "link": "@href"}'
"""

class SingleSpider(scrapy.Spider):
    """Spider that extracts fields from pages using caller-supplied XPaths.

    The ``first`` XPath selects the nodes of interest on each page; every
    other entry in ``xpaths`` is evaluated against each selected node and
    becomes one field of the yielded dict.
    """
    name = "url"

    def __init__(self, start_urls, xpaths, *args, **kwargs):
        """Decode the JSON-encoded spider arguments.

        :param start_urls: JSON-encoded list of URLs to crawl.
        :param xpaths: JSON-encoded dict of XPath expressions. It must
            contain the key ``first``; the remaining keys name the output
            fields.
        :raises ValueError: if either argument is not valid JSON, or if
            ``xpaths`` does not decode to a dict containing ``first``.
        """
        super(SingleSpider, self).__init__(*args, **kwargs)

        self.start_urls = json.loads(start_urls)  # type: list
        self.logger.debug("start_urls: %s", self.start_urls)

        xpaths = json.loads(xpaths)
        # Fail fast here instead of raising KeyError on every response later.
        if not isinstance(xpaths, dict) or 'first' not in xpaths:
            raise ValueError(
                "xpaths must be a JSON object containing the key 'first'"
            )
        self.xpaths = xpaths  # type: dict
        self.logger.debug("xpaths: %s", self.xpaths)

    def parse(self, response):
        """Yield one dict per node matched by the ``first`` XPath.

        :param response: the response whose content is queried.
        :return: generator of dicts mapping every key of ``self.xpaths``
            except ``first`` to the first value extracted by its XPath.
        """
        # Read 'first' without removing it: parse() runs once per response,
        # so popping the key would raise KeyError from the second response on.
        first_xpath = self.xpaths['first']  # this key name is fixed
        field_xpaths = {
            key: expr for key, expr in self.xpaths.items() if key != 'first'
        }
        for sel in response.xpath(first_xpath):
            # Only the first match of each field XPath is kept.
            # TODO: unsure whether dropping the other matches affects results.
            yield {
                key: sel.xpath(expr).extract_first()
                for key, expr in field_xpaths.items()
            }